# inputs
# Folder the raw CSV extracts are read from.
dataset_path <- "./Data/"
# Folder for helper scripts.
# NOTE(review): not referenced in the visible part of this file -- confirm use.
function_path <- "./Functions/"
# outputs
# Interactive widgets, static images and exported tables respectively.
itrt_plot_path <- "./Output/InteractivePlot/"
sttc_plot_path <- "./Output/StaticPlot/"
out_data_path <- "./Output/Data/"
# Make sure every output folder exists before anything is written to it, so
# the script also runs on a fresh checkout. Existing folders are left alone.
invisible(lapply(
  c(itrt_plot_path, sttc_plot_path, out_data_path),
  dir.create,
  recursive = TRUE,
  showWarnings = FALSE
))
# Load the two source tables from `dataset_path`:
#   daily_covid   -- one row per country and date
#   summary_covid -- one row per country
# `import()` is expected to come from a package attached earlier in the file.
daily_covid <- dataset_path %>%
  paste0("worldometer_coronavirus_daily_data.csv") %>%
  import()
summary_covid <- dataset_path %>%
  paste0("worldometer_coronavirus_summary_data.csv") %>%
  import()
# Preview the first rows of the daily table.
daily_covid %>% head()
## date country cumulative_total_cases daily_new_cases active_cases
## 1 2020-02-15 Afghanistan 0 NA 0
## 2 2020-02-16 Afghanistan 0 NA 0
## 3 2020-02-17 Afghanistan 0 NA 0
## 4 2020-02-18 Afghanistan 0 NA 0
## 5 2020-02-19 Afghanistan 0 NA 0
## 6 2020-02-20 Afghanistan 0 NA 0
## cumulative_total_deaths daily_new_deaths
## 1 0 NA
## 2 0 NA
## 3 0 NA
## 4 0 NA
## 5 0 NA
## 6 0 NA
# Preview the first rows of the per-country summary table.
summary_covid %>% head()
## country continent total_confirmed total_deaths total_recovered
## 1 Afghanistan Asia 158275 7367 145750
## 2 Albania Europe 213257 3228 202077
## 3 Algeria Africa 220415 6310 151347
## 4 Andorra Europe 25289 141 21511
## 5 Angola Africa 86636 1789 67477
## 6 Anguilla North America 1777 6 1702
## active_cases serious_or_critical total_cases_per_1m_population
## 1 5158 1124 3932
## 2 7952 23 74227
## 3 62758 34 4893
## 4 3637 31 326512
## 5 17370 7 2518
## 6 69 NA 116869
## total_deaths_per_1m_population total_tests total_tests_per_1m_population
## 1 183 826810 20541
## 2 1124 1495002 520354
## 3 140 230861 5125
## 4 1820 249838 3225714
## 5 52 1296669 37686
## 6 395 51382 3379283
## population
## 1 40250878
## 2 2873049
## 3 45046063
## 4 77452
## 5 34407243
## 6 15205
# Clean the daily table in two steps:
#   1. overwrite every missing cell, in any column, with 0;
#   2. convert the `date` column to class Date.
daily_covid[is.na(daily_covid)] <- 0
daily_covid$date <- as.Date(daily_covid$date)
# Sanity check: which countries still have missing cumulative death counts?
# NOTE(review): this runs after the NA -> 0 replacement earlier in the file,
# so it can only come back empty; run it before that step to audit raw data.
(x <- daily_covid %>%
  filter(is.na(cumulative_total_deaths)) %>%
  group_by(country) %>%
  summarise(n = n()) %>%
  select(country))
## # A tibble: 0 x 1
## # ... with 1 variable: country <chr>
(y <- pull(x, country))
## character(0)
# `y` may hold zero, one or many country names, so match with %in%; `==`
# would recycle `y` along the rows and select the wrong ones.
(subset(daily_covid, country %in% y))
## [1] date country cumulative_total_cases
## [4] daily_new_cases active_cases cumulative_total_deaths
## [7] daily_new_deaths
## <0 rows> (or 0-length row.names)
What did the overall picture of COVID cases look like during that period?
## global percentage of death, active case and recovered ##
# Columns of `summary_covid` that become the three slices of the donut chart.
categories <- c("total_deaths", "total_recovered", "active_cases")
# Legend labels derived from the column names: underscores become spaces and
# every word is capitalised.
category <- str_to_title(
  str_replace_all(categories, pattern = "_", replacement = " ")
)
# Sum each selected column over all rows (missing values are skipped) and
# store one row per slice.
data <- data.frame(
  category = category,
  count = colSums(summary_covid[, categories], na.rm = TRUE)
)
# Count with thousands separators, used as the outer label.
data$prettyCount <- prettyNum(data$count, big.mark = ",", scientific = FALSE)
# Share of each slice in the total.
data$fraction <- data$count / sum(data$count)
# Cumulative share = top edge of each rectangle.
data$ymax <- cumsum(data$fraction)
# Bottom edge of each rectangle = top edge of the previous one (0 for the first).
data$ymin <- c(0, head(data$ymax, n = -1))
# Labels sit halfway between the bottom and top edge.
data$labelPosition <- (data$ymax + data$ymin) / 2
# Share formatted for display, used as the inner label.
data$prettyFraction <- percent(data$fraction)
# Donut chart: stacked rectangles between x = 3 and x = 4, wrapped around the
# y axis by coord_polar(). xlim() starts at 2 so the centre stays empty.
q1 <-
  ggplot(
    data,
    aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = category)
  ) +
  geom_rect() +
  # Outer labels: absolute counts, coloured like their slice.
  # `x` is a fixed value here; it controls the label position (inner / outer).
  geom_text(
    aes(y = labelPosition, label = prettyCount, color = category),
    x = 4.3,
    fontface = "bold",
    size = 3.5
  ) +
  # Inner labels: percentages printed in white on top of the slice.
  geom_text(
    aes(y = labelPosition, label = prettyFraction),
    x = 3.5,
    fontface = "bold",
    color = "white",
    size = 4
  ) +
  scale_fill_brewer(palette = "Set2") +
  scale_color_brewer(palette = "Set2") +
  coord_polar(theta = "y") +
  xlim(c(2, 4)) +
  theme_void() +
  # Grand total of the three slices, placed at x = 2 (the inner limit).
  annotate(
    geom = "text",
    x = 2,
    y = 0,
    colour = "#eba834",
    label = paste0(
      "Total Cases\n",
      prettyNum(sum(data$count), big.mark = ",", scientific = FALSE)
    )
  )
# Pass the plot explicitly instead of relying on the last plot that was built.
ggsave(paste0(sttc_plot_path, "q1_pie.png"), plot = q1)
## Saving 7 x 5 in image
q1
What is the general scale of infection across continents and countries?
## comparison of cases between different continent ##
# Shared title for the static treemap and the root of the interactive one.
q2_title <- "Group by continent top 70 percentile confirmed cases"
q2 <-
  summary_covid %>%
  select(country:active_cases) %>%
  # Within each continent keep only the countries whose confirmed cases lie
  # above that continent's 70th percentile (drops the small ones).
  group_by(continent) %>%
  filter(total_confirmed > quantile(total_confirmed, 0.7)) %>%
  ungroup() %>%
  # Wide to long: the three count columns become (category, count) pairs.
  gather(
    category, count,
    total_recovered, active_cases, total_deaths,
    factor_key = TRUE
  ) %>%
  # Relabel the categories; levels are spelled out so every label is tied to
  # its column name rather than to the order of the gather() call.
  mutate(
    category = factor(
      category,
      levels = c("total_recovered", "active_cases", "total_deaths"),
      labels = c("Recovered", "Active Cases", "Deaths")
    )
  ) %>%
  # Three nesting levels: continent > country > category, sized by count.
  treemap(
    index = c("continent", "country", "category"),
    vSize = "count",
    type = "index",
    palette = "Set2",
    title = q2_title,
    align.labels = list(
      c("center", "center"),
      c("left", "top"),
      c("left", "bottom")
    )
  )
itrt_q2 <- d3tree2(q2, rootname = q2_title)
# The widget is an HTML page, so save it with an .html extension.
saveWidget(itrt_q2, file = paste0(itrt_plot_path, "q2.html"))
itrt_q2
How many people suffered from covid?
# overview of accumulated cases vs date for all the country
# global stacked area plot
# Build one row per (date, series): the cumulative counts are summed over all
# countries for each date, reshaped to long form, and given a tooltip string.
data <-
  daily_covid %>%
  group_by(date) %>%
  summarise(
    cumulative_total_cases = sum(cumulative_total_cases, na.rm = TRUE),
    cumulative_total_deaths = sum(cumulative_total_deaths, na.rm = TRUE)
  ) %>%
  gather(
    categories, count,
    cumulative_total_cases, cumulative_total_deaths
  ) %>%
  # Tooltip text. The series name is the part of `categories` after the last
  # underscore ("cases" / "deaths"); sub() is vectorised, so no rowwise()
  # is needed.
  mutate(
    text = paste(
      sub("^.*_", "", categories),
      "Count:", count,
      "\nDate:", date
    )
  ) %>%
  arrange(date) # this is just to check if text is appended properly
# group_by(date, categories)
# write.csv(data, paste0(out_data_path, "temp.csv"))
# (subset(data, is.na(text))) # enable this to check if something is na
# Facet strip titles, keyed by the values of `categories`.
facet_labels <- c(
  "cumulative_total_cases" = "Cumulative Cases",
  "cumulative_total_deaths" = "Cumulative Deaths"
)
# One area panel per series, each with its own y scale. The `text` aesthetic
# is a column prepared in `data`; it is only consumed by ggplotly() below.
q3 <-
  ggplot(
    data,
    aes(x = date, y = count, group = categories, fill = categories, text = text)
  ) +
  geom_area() +
  facet_wrap(
    ~categories,
    scales = "free_y",
    labeller = as_labeller(facet_labels)
  ) +
  scale_fill_viridis(discrete = TRUE, option = "B", begin = 0.3, end = 0.7) +
  scale_x_date(date_labels = "%b %Y") +
  # y axis in millions
  scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  ggtitle("Cumulative Covid Cases") +
  # scale_fill_discrete(name = "Categories", labels = c("Cumulative Cases", "Cumulative Death Cases")) +
  ylab("Covid Cases") +
  xlab("Date") +
  theme_ipsum() +
  # The legend is hidden once, after the base theme has been applied.
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
q3
itrt_q3 <- ggplotly(q3, tooltip = "text")
itrt_q3
# hand picked countries multiple of stacked area plots
Which country has the most cases?
# ranking of cases for the top n countries
# hist
How have active cases changed over that period for each country?
# video hist
We can easily tell from previous plots that most of the cases are from big countries. What is the relation between population and Covid cases?
# we can easily tell from previous plots that most of the cases are from big
# countries. Now, I'm curious about the relation between population & Covid cases
# Bubble chart of confirmed cases against population, one bubble per country:
# fill = continent, bubble size = total tests, both axes on a log10 scale.
q11 <- summary_covid %>%
  # Draw the biggest bubbles first so smaller ones stay visible on top.
  # Bubble size is mapped to `total_tests`, so that is the column to sort by.
  arrange(desc(total_tests)) %>%
  # prepare text for tooltip
  # `commaNum()` is a helper defined outside this section; presumably it
  # formats numbers with thousands separators -- confirm.
  mutate(
    text = paste0(
      "Country: ", country,
      "\nPopulation: ", commaNum(population),
      "\nTotal Cases:\t", commaNum(total_confirmed),
      "\nTotal Tests:\t", commaNum(total_tests)
    )
  ) %>%
  ggplot(
    aes(
      x = population,
      y = total_confirmed,
      fill = continent,
      size = total_tests,
      text = text
    )
  ) +
  # shape 21 has both an outline (`color`) and an interior (`fill`)
  geom_point(alpha = 0.5, color = "black", shape = 21, na.rm = TRUE) +
  # Tick labels are shown in millions; breaks every two orders of magnitude.
  scale_x_log10(
    labels = unit_format(unit = "M", scale = 1e-6),
    breaks = 1e+3 * 10^(seq(0, 20, 2))
  ) +
  scale_y_log10(
    labels = unit_format(unit = "M", scale = 1e-6),
    breaks = 10^(seq(1, 21, 2))
  ) +
  scale_size(range = c(2, 25), name = "Total Tests (M), Size") +
  labs(fill = "Continent, Color") +
  scale_fill_viridis(discrete = TRUE, option = "D") +
  coord_cartesian(clip = "off") +
  ylab("Covid Cases (M), log10(n)") +
  xlab("Population (M), log10(n)") +
  theme_bw()
# Save the static version; the plot is passed explicitly instead of relying
# on the last plot that was built.
ggsave(paste0(sttc_plot_path, "q11_bubble.png"), plot = q11)
## Saving 12 x 6 in image
# turn interactive ggplot with plotly and save it
itrt_q11 <- ggplotly(q11, tooltip = "text")
# The widget is an HTML page, so save it with an .html extension.
saveWidget(itrt_q11, file = paste0(itrt_plot_path, "q11.html"))
q11
itrt_q11
Which country did well in this pandemic war?
Can weather or geographic location affect the spread of COVID?
An even more in-depth question: we’ve seen that population definitely affects how COVID spreads, so let’s check and see why social distancing is needed.
How did the R rate behave starting from the beginning of covid?
# pacman::p_unload(all)